#include "statsxx/machine_learning/Ensemble.hpp"

// STL
#include <algorithm>           // std::min(), std::max()
#include <cmath>               // std::log(), std::exp()
#include <cstddef>             // std::size_t
#include <fstream>             // std::ofstream
#include <limits>              // std::numeric_limits<>::lowest(), ::epsilon()
#include <string>              // std::string
#include <vector>              // std::vector<>

// jScience
#include "jScience/linalg.hpp" // Matrix<>, Vector<>, normalize()
#include "jrandnum.hpp"        // rand_num_uniform_Mersenne_twister()

// stats++
#include "statsxx/machine_learning/loss_functions.hpp" // Brier_score()


//
// DESC: Use Bayesian Model Combination (BMC) to determine the weightings of each Learner.
//
// INPUT:
//
//     int N :: number of weightings to sample (smaller == faster, bigger == more precise results)
//              NOTE: Wikipedia had suggested that 100 MIGHT be a reasonable value.
//
// -----
//
// NOTE: BMC (like BMA) seeks to approximate the Bayes Optimal Classifier.
//
// NOTE: The space of all possible ensembles (linear combination of learners) is sampled to find the one which maximizes the probability that the training data was generated from it.
//
// NOTE: This algorithm is supposed to be an algorithmic correction to BMA, and has been shown [in some case(s)] to outperform both it and bagging with statistical significance.
//
// -----
//
// NOTE: In a previous version (NeuralNetEnsemble), I NOTEd that overfitting with training DOES NOT seem to occur with this approach.
//
// -----
//
// NOTE: See:
//
//     Monteith, K. et al., ``Turning Bayesian Model Averaging into Bayesian Model Combination,'' Proceedings of the International Joint Conference on Neural Networks IJCNN'11. pp. 2657–2663.
//
// NOTE: ... The implementation below is based on this paper, and also:
//
//     Algorithm 19.2 in: C. C. Aggarwal, "Data Classification: Algorithms and Applications"
//
// NOTE: ... And also an earlier version of the Wikipedia page:
//
//     https://en.wikipedia.org/wiki/Ensemble_learning
//
// NOTE: See also:
//
//     T. P. Minka, ``Bayesian model averaging is not model combination'' (2002).
//
// -----
//
// NOTE: See also the NOTEs in ::optimize_BMA().
//
template<typename T>
inline void Ensemble<T>::optimize_BMC(
                                      const Matrix<double> &X_in,
                                      const Matrix<double> &X_out,
                                      // -----
                                      const int             N,
                                      // -----
                                      const std::string     prefix
                                      )
{
    //=========================================================
    // INITIALIZATION
    //=========================================================

    // Smallest offset used to keep quantities strictly inside (0,1) so that
    // std::log() never sees 0 (log(0) == -inf, and 0*log(0) == NaN).
    constexpr double tiny = std::numeric_limits<double>::epsilon();

    // SETUP OUTPUT FILE
    //
    // NOTE: One "<iteration> <predictive accuracy>" line is written per sample,
    //       so convergence of the BMC weights can be monitored externally.
    std::ofstream ofs( (prefix + ".BMC.dat") );

    // SET THE INITIAL WEIGHTS TO 0
    std::vector<double> _w(this->learners.size(), 0.);

    // Running (scaled) sum of model likelihoods; used to renormalize _w on each update.
    double sum_w = 0.;

    // Running maximum log-likelihood; likelihoods are tracked relative to this
    // value (log-sum-exp style) to maintain numerical stability.
    double z = std::numeric_limits<double>::lowest();

    // CALCULATE OUTPUT FROM INDIVIDUAL LEARNERS
    //
    // NOTE: Each learner is evaluated on the training set ONCE, up front; the
    //       sampling loop below then only re-combines these cached outputs.
    std::vector<Matrix<double>> X_out_i(this->learners.size(), Matrix<double>(X_in.size(0),X_out.size(1)));

    for( std::size_t i = 0; i < this->learners.size(); ++i )
    {
        for( auto j = 0; j < X_in.size(0); ++j )
        {
            std::vector<double> output = this->learners[i].f_evaluate(X_in.row(j).std_vector());

            for( auto k = 0; k < X_out.size(1); ++k )
            {
                X_out_i[i](j,k) = output[k];
            }
        }
    }

    //=========================================================
    // SAMPLE
    //=========================================================
    for( int i = 0; i < N; ++i )
    {
        //---------------------------------------------------------
        // DRAW MODEL FROM A UNIFORM DIRICHLET DISTRIBUTION
        //---------------------------------------------------------
        //
        // NOTE: The assumption is that the learners are distributed according to some distribution.
        //
        // NOTE: The purpose of BMC is to select (from this) the optimal one.
        //
        // NOTE: -log(U) with U ~ Uniform(0,1] is an Exponential(1) draw; a vector
        //       of such draws normalized to unit sum is a uniform Dirichlet sample.
        //
        // NOTE(review): this relies on normalize() performing L1 (sum-to-one)
        //       normalization — confirm against jScience/linalg.hpp.

        std::vector<double> v;

        v.reserve(this->learners.size());

        for( std::size_t j = 0; j < this->learners.size(); ++j )
        {
            // Clamp the uniform draw away from 0: the bounds passed here are
            // (0., 1.), and a draw of exactly 0 would give -log(0) == +inf,
            // degenerating the Dirichlet sample.
            const double u = std::max( rand_num_uniform_Mersenne_twister(0., 1.), tiny );

            v.push_back( (-std::log(u)) );
        }

        v = normalize(Vector<double>(v)).std_vector();

        //---------------------------------------------------------
        // CALCULATE THE LIKELIHOOD OF THIS MODEL (v), GIVEN THE DATA
        //---------------------------------------------------------

        // CALCULATE OUTPUT OF MODEL
        this->w = v;

        Matrix<double> output = this->combine_output(
                                                     X_out_i
                                                     );

        // CALCULATE THE PROBABILITY OF CLASS CORRUPTION
        double eps = loss_function::Brier_score(
                                                output,
                                                X_out
                                                );

        // ... AND USE THIS TO *ESTIMATE* THE LOG-LIKELIHOOD
        //
        // NOTE: eps is clamped into (0,1): a perfect (eps == 0) or maximally
        //       wrong (eps == 1) model would otherwise yield 0*log(0) == NaN,
        //       which compares false against z and silently poisons _w and
        //       sum_w below. The clamp is the correct limit, since
        //       x*log(x) -> 0 as x -> 0.
        const double eps_c = std::min( std::max(eps, tiny), (1. - tiny) );

        double log_likelihood = X_in.size(0)*( eps_c*std::log(eps_c) + (1. - eps_c)*std::log(1. - eps_c) );

        // NOTE: The BMA formula used to estimate p(h|D) is used in the same way to estimate p(e|D).
        //
        // NOTE: ... See the discussion in Monteith, et al. on p. 2660.

        //---------------------------------------------------------
        // UPDATE WEIGHTS
        //---------------------------------------------------------

        // NOTE: In an earlier version (based on Wikipedia), the following was based on the log_likelihood.
        //
        // NOTE: The following in Aggarwal though is based on the likelihood.
        //
        // NOTE: ... It seems that the latter cannot be correct though, considering that we want to add models based on their likelihood [and not exp(likelihood)].

        // If this sample sets a new maximum log-likelihood, rescale the
        // accumulated weights to the new reference point z (log-sum-exp shift).
        if( log_likelihood > z )
        {
            for( auto &_wi : _w )
            {
                _wi *= std::exp( (z - log_likelihood) );
            }

            z = log_likelihood;
        }

        // Likelihood of this sample relative to the best seen so far (<= 1).
        double ww = std::exp( (log_likelihood - z) );

        for( std::size_t j = 0; j < _w.size(); ++j )
        {
            // NOTE: The first term is used to renormalize the (running) weights, ...
            //
            // NOTE: ... the second adds the weights of this model times its likelihood p(e|D).
            _w[j] = _w[j]*sum_w/(sum_w + ww) + ww*v[j];
        }

        sum_w += ww;

        //---------------------------------------------------------

        // NOTE: The following is a different (but more understandable) formula than Aggarwal.
        //
        // NOTE: ... This adds models (weights) according to their likelihood p(e|D), and keeps the weights normalized.
        //
        // NOTE: IMPORTANT: It is not clear that this is a correct approach, however.
        //
        // NOTE: It leads to *similar* results, with different (less smooth convergence) behavior.

//        for( auto j = 0; j < _w.size(); ++j )
//        {
//            _w[j] = ( sum_w*_w[j] + likelihood*v[j] )/(sum_w + likelihood);
//        }
//
//        sum_w += likelihood;

        //---------------------------------------------------------

        //---------------------------------------------------------
        // OUTPUT
        //---------------------------------------------------------
        //
        // NOTE: This requires additional computation, but allows monitoring of BMC.

        // NORMALIZE WEIGHTS
        std::vector<double> tmp_w = normalize(Vector<double>(_w)).std_vector();

        // CALCULATE OUTPUT OF MODEL
        this->w = tmp_w;

        output = this->combine_output(
                                      X_out_i
                                      );

        // CALCULATE PREDICTIVE ACCURACY
        double x = 1. - loss_function::Brier_score(
                                                   output,
                                                   X_out
                                                   );

        ofs << i << " " << x << '\n';
    }

    // NORMALIZE WEIGHTS
    _w = normalize(Vector<double>(_w)).std_vector();

    // ASSIGN TO ENSEMBLE
    this->w = _w;

    //=========================================================
    // CLEANUP AND RETURN
    //=========================================================

    ofs.close();
}
